library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
library(xgboost)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
library(tidyverse)
## ── Attaching packages ────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.0.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.5
## ✔ tidyr 0.8.1 ✔ stringr 1.3.1
## ✔ readr 1.1.1 ✔ forcats 0.3.0
## ── Conflicts ───────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ dplyr::combine() masks randomForest::combine()
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ lubridate::intersect() masks base::intersect()
## ✖ dplyr::lag() masks stats::lag()
## ✖ ggplot2::margin() masks randomForest::margin()
## ✖ lubridate::setdiff() masks base::setdiff()
## ✖ dplyr::slice() masks xgboost::slice()
## ✖ lubridate::union() masks base::union()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(segmented)
# Load project helpers (rmse, fit_xgboost, fit_svm, ...) and the pre-built
# tables (features, data_before, features_on, outcomes) used below.
source('functions.r')
load("Table_construction.Rdata")
## Assemble the modelling table: join demographics (race, sex, name) from
## data_before, the screening-date features (features_on) and the outcomes
## table, then engineer capped count features, race indicators, the
## criminal-involvement subscale, and the row filters filt1-filt6.
features = features %>%
  # Add other useful information:
  inner_join(
    data_before %>%
      select(person_id, screening_date, people) %>%
      unnest() %>%
      select(person_id, screening_date, race, sex, name),
    by = c("person_id","screening_date")
  ) %>%
  inner_join(features_on, by = c("person_id","screening_date")) %>%
  inner_join(outcomes, by = c("person_id","screening_date")) %>%
  select(-c(offenses_within_30.y)) %>%
  # Create as many features as possible:
  mutate(
    raw_score = `Risk of Recidivism_raw_score`, # Adjust for violent/general
    decile_score = `Risk of Recidivism_decile_score`, # Adjust for violent/general
    # Cap count features at 5 to limit the leverage of extreme values.
    p_jail30 = pmin(p_jail30,5),
    # BUG FIX: was pmin(p_jail30, 5), which silently made p_prison30 an
    # exact copy of p_jail30 (the source of the rank-deficient NA
    # coefficient on p_prison30 in the Group 5 linear model).
    p_prison30 = pmin(p_prison30,5),
    p_prison = pmin(p_prison,5),
    p_probation = pmin(p_probation,5),
    # Race indicator columns; race == "Other" is the baseline.
    race_black = if_else(race=="African-American",1,0),
    race_white = if_else(race=="Caucasian",1,0),
    race_hispanic = if_else(race=="Hispanic",1,0),
    race_asian = if_else(race=="Asian",1,0),
    race_native = if_else(race=="Native American",1,0), # race == "Other" is the baseline
    # Subscales:
    crim_inv = p_charge+
      p_jail30+
      p_prison+
      p_probation,
    # Filters (TRUE for observations to keep)
    # BUG FIX: the two decile-score conditions were separated by a comma,
    # so only the first was assigned to filt1 and the second created a
    # stray unnamed column; combine both conditions with &.
    filt1 = `Risk of Recidivism_decile_score` != -1 &
      `Risk of Violence_decile_score` != -1,
    filt2 = offenses_within_30.x == 1,
    filt3 = !is.na(current_offense_date),
    filt4 = current_offense_date <= current_offense_date_limit,
    filt5 = p_current_age > 18 & p_current_age <= 65,
    filt6 = crim_inv == 0
  )
Modelling the COMPAS Risk of Recidivism raw score with a quadratic polynomial in age.
## Fit the age lower bound: restrict to valid scores (filt1), working ages
## (filt5) and no criminal involvement (filt6), then model the per-age
## minimum raw score with a quadratic polynomial in age.
features_age_poly = features %>%
filter(filt1,filt5, filt6)
#filter out any individuals with crim inv history on lb of age poly
# One (or more, on ties) row per age: the smallest raw score observed at
# that age. top_n(n = -1) keeps the minimum.
lb_age = features_age_poly %>%
group_by(p_current_age) %>%
arrange(raw_score) %>%
top_n(n=-1, wt=raw_score) # Fit lower bound on smallest value
# Quadratic fit raw_score ~ age + age^2 on the lower-bound points.
mdl_age = lm(raw_score ~
I(p_current_age^2) +
p_current_age,
data=lb_age)
# More precision for paper
summary(mdl_age)
##
## Call:
## lm(formula = raw_score ~ I(p_current_age^2) + p_current_age,
## data = lb_age)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.22772 -0.01663 0.01004 0.02784 0.14974
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.580e-02 6.114e-02 -0.586 0.56
## I(p_current_age^2) 4.634e-04 3.664e-05 12.646 <2e-16 ***
## p_current_age -7.480e-02 3.096e-03 -24.161 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.05025 on 93 degrees of freedom
## Multiple R-squared: 0.9879, Adjusted R-squared: 0.9877
## F-statistic: 3806 on 2 and 93 DF, p-value: < 2.2e-16
print("Coefficients:")
## [1] "Coefficients:"
sprintf("%.20e",mdl_age$coefficients) # More precision for paper
## [1] "-3.58034341549837223373e-02" "4.63390462694484011469e-04"
## [3] "-7.48013245320391095827e-02"
## Add f(age) to features
features = features %>%
mutate(
# Predicted age lower bound at each individual's age.
f_age = predict(mdl_age, newdata=data.frame(p_current_age=p_current_age)),
raw_score__age_poly = raw_score - f_age,
# filt7: keep scores at or above the age polynomial (0.05 tolerance).
filt7 = raw_score >= f_age - 0.05
)
## Add same filters to lb_age
lb_age = lb_age %>%
mutate(
f_age = predict(mdl_age, newdata=data.frame(p_current_age=p_current_age)),
raw_score__age_poly = raw_score - f_age,
filt7 = raw_score >= f_age - 0.05
)
Plotting settings
# Age axis limits and a dense grid for plotting fitted curves.
xmin = 18
xmax = 65
xx = seq(xmin,xmax, length.out = 1000)
Generate a preliminary plot of age vs COMPAS general score
# Lower-bound points coloured by filt7 (above/below the age polynomial),
# with the fitted quadratic overlaid.
ggplot()+
geom_point(aes(x=p_current_age, raw_score, color = factor(filt7)),alpha=.3, data=lb_age) +
geom_line(aes(x=xx, predict(mdl_age, newdata=data.frame(p_current_age=xx))),color="#F8766D") +
theme_bw()+
xlim(xmin,xmax)+
xlab("Age at COMPAS screening date") +
ylab("General score") +
theme(text = element_text(size=18),
axis.text=element_text(size=18),
legend.position="none")
# Same fitted quadratic over the full filtered sample (not just the
# per-age lower-bound points).
ggplot()+
geom_point(aes(x=p_current_age, raw_score), color="#619CFF",alpha=.3, data=features_age_poly) +
geom_line(aes(x=xx, predict(mdl_age, newdata=data.frame(p_current_age=xx))),color="#F8766D") +
theme_bw()+
xlim(xmin,xmax)+
xlab("Age at COMPAS screening date") +
ylab("General score") +
theme(text = element_text(size=18),
axis.text=element_text(size=18),
legend.position="none")
## Rebuild the lower bound after also requiring filt7 (at/above the age
## polynomial): one minimum-score row per age.
features_age_poly2 = features %>%
  filter(filt1, filt5, filt6, filt7)
lb_filt = features_age_poly2 %>%
  group_by(p_current_age) %>%
  arrange(raw_score) %>%
  top_n(n=-1, wt = raw_score)
#filtered out points
# BUG FIX: restrict both setdiff() inputs (and the result) to an explicit
# common column set. Previously setdiff()/rbind() mixed tables with
# different columns (lb_filt carries age_spline/filt8 columns that lb_age
# lacks, and lb_outliers kept every column while `lb` below keeps only
# three plus `reason`), which breaks the row binding.
lb_outliers = rbind(
  #reason not included in lb_filt was
  #age not above age poly
  setdiff(
    lb_age %>% ungroup() %>%
      select(p_current_age, p_age_first_offense, raw_score, filt7),
    lb_filt %>% ungroup() %>%
      select(p_current_age, p_age_first_offense, raw_score, filt7)
  ) %>%
    filter(!filt7) %>% #below age poly
    mutate(reason = "Below age polynomial") %>%
    select(-filt7)
)
# All lower-bound points: those used for the splines plus the outliers.
lb = lb_filt %>%
  select(p_current_age, p_age_first_offense, raw_score) %>%
  mutate(reason = "Used to construct linear splines") %>%
  rbind(lb_outliers)
Generating linear splines to fit the lower bound. Plotting individuals on the new bottom edge produced by fitting to the lb_filt individuals.
# Two-breakpoint linear spline (segmented regression) of raw score on age,
# fitted to the filtered lower-bound points; psi supplies breakpoint
# starting values (39, 58).
mdl_age_spline <- segmented(lm(raw_score ~ p_current_age, data = lb_filt),
seg.Z = ~p_current_age, psi = list(p_current_age = c(39,58)),
control = seg.control(display = FALSE)
)
#Add Filter 8
# filt8 mirrors filt7 but uses the spline lower bound (0.05 tolerance).
features = features %>%
mutate(
age_spline = predict(mdl_age_spline, newdata=data.frame(p_current_age=p_current_age)),
raw_score__age_spline = raw_score - age_spline,
filt8 = raw_score >= age_spline - 0.05
)
# Estimated breakpoints (initial vs fitted).
summary.segmented(mdl_age_spline)$psi
## Initial Est. St.Err
## psi1.p_current_age 39 32.73622 0.4637932
## psi2.p_current_age 58 49.65780 0.9842671
# Fitted breakpoints and per-segment x grids for plotting each piece.
break1 = summary.segmented(mdl_age_spline)$psi[1,2]
break2 = summary.segmented(mdl_age_spline)$psi[2,2]
xx1 = seq(xmin,break1, length.out=1000)
xx2 = seq(break1,break2, length.out=1000)
xx3 = seq(break2,xmax, length.out=1000)
# Lower-bound scatter coloured by inclusion reason, with the three fitted
# spline segments drawn in distinct colours; saved for the paper.
age_spline_recid =
ggplot()+
geom_point(aes(x=p_current_age, raw_score, color= factor(reason)),alpha=.5, data=lb ) +
scale_color_manual(values=c("red", "grey25")) +
geom_line(aes(x=xx1, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx1))),
color="darkorange1", size = 1) +
geom_line(aes(x=xx2, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx2))),
color="cyan3", size = 1) +
geom_line(aes(x=xx3, y = predict(mdl_age_spline, newdata=data.frame(p_current_age=xx3))),
color="seagreen3", size = 1) +
theme_bw()+
xlim(xmin,xmax)+
xlab("\n Age at COMPAS screening date") +
ylab("General Score \n") +
theme(text = element_text(size=9),
axis.text=element_text(size=7),
legend.position = c(.95,.95),
legend.title = element_blank(),
legend.justification = c("right", "top"),
legend.key = element_rect(fill = "aliceblue"),
legend.background = element_rect(fill="aliceblue",
size=0.5, linetype="solid")
)
age_spline_recid
ggsave("Figures/age_LB_general.pdf",width = 3.5, height = 2.5, units = "in")
# Age spline with all data (filters 1,5 still applied)
# Red points are below the age polynomial not age spline
ggplot()+
geom_point(aes(x=p_current_age, raw_score), color="#F8766D", data=features %>% filter(filt1,filt5,!filt7)) + # Age outliers
geom_point(aes(x=p_current_age, raw_score), color="#619CFF", alpha=.3, data=features %>% filter(filt1,filt5,filt7)) + # Not age outliers
geom_line(aes(x=xx, predict(mdl_age_spline, newdata=data.frame(p_current_age=xx))),color="#F8766D") +
theme_bw()+
xlim(xmin,xmax)+
xlab("Age at COMPAS screening date") +
ylab("General score") +
theme(text = element_text(size=12),
axis.text=element_text(size=12),
legend.position="top")
ggsave("Figures/age_LB_alldata_general.pdf",width = 3.5, height = 2.5, units = "in")
# Criminal-involvement subscale vs (raw score - f_age), coloured by filt8
# (above/below the spline lower bound).
features %>%
filter(filt1,filt3) %>%
select(crim_inv, raw_score__age_spline, filt8) %>%
ggplot() +
geom_point(aes(x=crim_inv,y=raw_score__age_spline,color=filt8),alpha=.5) +
xlim(c(0,70))+
theme_bw()+
xlab("Sum of Criminal Involvement Components") +
ylab(expression(General~score~-~f[age])) +
theme(text = element_text(size=12),
axis.text=element_text(size=12),
legend.position="top") +
scale_color_manual(name=element_blank(),
breaks=c("TRUE", "FALSE"),
labels=c(expression(Above~f[age]), expression(Below~f[age])),
values=c("TRUE"="#619CFF","FALSE"="#00BA38"))
## Warning: Removed 10 rows containing missing values (geom_point).
ggsave("Figures/crim_inv_LB_general.pdf",width = 3.5, height = 2.5, units = "in")
## Warning: Removed 10 rows containing missing values (geom_point).
Also look at just number of priors
# Number of prior charges vs (raw score - f_age), coloured by filt8.
features %>%
filter(filt1,filt3) %>%
select(p_charge, raw_score__age_spline, filt8) %>%
ggplot() +
geom_point(aes(x=p_charge,y=raw_score__age_spline,color=filt8),alpha=.5) +
xlim(c(0,70))+
theme_bw()+
xlab("Number of prior charges") +
ylab(expression(General~score~-~f[age])) +
theme(text = element_text(size=12),
axis.text=element_text(size=12),
legend.position="top") +
scale_color_manual(name=element_blank(),
breaks=c("TRUE", "FALSE"),
labels=c(expression(Above~f[age]), expression(Below~f[age])),
values=c("TRUE"="#619CFF","FALSE"="#00BA38"))
## Warning: Removed 6 rows containing missing values (geom_point).
ggsave("Figures/priors_LB_general.pdf",width = 3.5, height = 2.5, units = "in")
## Warning: Removed 6 rows containing missing values (geom_point).
There are a few groups of predictors we will use: * Group 1: without using age variables or race * Group 2: without using age variables but with race * Group 3: without using race but with age variables * Group 4: using age variables and race * Group 5: using age variables, race, and additional count features (p_arrest, p_prison30)
#### Generic stuff (applies to all models)
## Filter rows
# Training rows for every group: valid decile scores (filt1) and a
# non-missing current offense date (filt3).
features_filt = features %>%
filter(filt1, filt3)
## Set parameters (each combination will be run)
# xgboost
# NOTE(review): "reg:linear" is deprecated in recent xgboost releases in
# favour of "reg:squarederror"; kept as-is to match the recorded output.
param <- list(objective = "reg:linear",
eval_metric = "rmse",
eta = c(.05,.1),
gamma = c(.5, 1),
max_depth = c(2,5),
min_child_weight = c(5,10),
subsample = c(1),
colsample_bytree = c(1)
)
# svm
param_svm = list(
type = 'eps-regression',
cost = c(0.5,1,2),
epsilon = c(0.5,1,1.5),
gamma_scale = c(0.5,1,2)
)
# Collects in-sample RMSE per predictor group and model family.
res_rmse = data.frame(Group = 1:5, lm = NA, xgb = NA, rf = NA, svm = NA)
### Create group 1 training data
## Select features and round count features
# Group 1: criminal-history counts only -- no age variables, no race.
train = features_filt %>%
select(
#p_current_age,
p_age_first_offense,
p_charge,
p_jail30,
p_prison,
p_probation,
raw_score__age_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)
# Linear-model baseline for this group.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.70479 -0.45320 -0.06662 0.37464 2.52256
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.0500813 0.0177354 59.208 <2e-16 ***
## p_age_first_offense -0.0068003 0.0005579 -12.188 <2e-16 ***
## p_charge 0.0257307 0.0010349 24.862 <2e-16 ***
## p_jail30 -0.0075073 0.0404660 -0.186 0.853
## p_prison 0.2073046 0.0085076 24.367 <2e-16 ***
## p_probation 0.1202088 0.0074718 16.088 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5757 on 9036 degrees of freedom
## Multiple R-squared: 0.4079, Adjusted R-squared: 0.4076
## F-statistic: 1245 on 5 and 9036 DF, p-value: < 2.2e-16
res_rmse[res_rmse$Group==1,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
set.seed(923)
# Grid search over `param` combinations (fit_xgboost from functions.r).
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 6
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline
res_rmse[res_rmse$Group==1,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
# Predicted vs actual; the red 45-degree line marks a perfect fit.
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(General~score~-~f[age])) +
ylab("XGBoost prediction")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(784)
mdl_rf = randomForest(
formula = raw_score__age_spline ~ .,
data = train
)
# Random forest RMSE uses out-of-bag predictions (mdl_rf$predicted).
res_rmse[res_rmse$Group==1,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 11
## type "eps-regression"
## cost "1"
## epsilon "0.5"
## gamma_scale "1"
## gamma "0.1666667"
res_rmse[res_rmse$Group==1,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP
# Remove per-group objects before the next group is fitted.
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 2 training data
## Select features and round count features
# Group 2: criminal-history counts plus race indicators -- no age variables.
train = features_filt %>%
select(
#p_current_age,
p_age_first_offense,
p_charge,
p_jail30,
p_prison,
p_probation,
race_black,
race_white,
race_hispanic,
race_asian,
race_native, # race == "Other" is the baseline
raw_score__age_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)
# Linear-model baseline for this group.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.53127 -0.43741 -0.06343 0.36477 2.37217
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7300096 0.0298034 24.494 < 2e-16 ***
## p_age_first_offense -0.0048329 0.0005678 -8.512 < 2e-16 ***
## p_charge 0.0248485 0.0010184 24.398 < 2e-16 ***
## p_jail30 0.0062706 0.0397791 0.158 0.87475
## p_prison 0.1974029 0.0083911 23.525 < 2e-16 ***
## p_probation 0.1170682 0.0073462 15.936 < 2e-16 ***
## race_black 0.3670750 0.0257221 14.271 < 2e-16 ***
## race_white 0.2429693 0.0259952 9.347 < 2e-16 ***
## race_hispanic 0.0877653 0.0311553 2.817 0.00486 **
## race_asian 0.0964541 0.0859453 1.122 0.26178
## race_native 0.2561655 0.1097032 2.335 0.01956 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5657 on 9031 degrees of freedom
## Multiple R-squared: 0.4286, Adjusted R-squared: 0.428
## F-statistic: 677.5 on 10 and 9031 DF, p-value: < 2.2e-16
res_rmse[res_rmse$Group==2,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
set.seed(480)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 14
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "0.5"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline
res_rmse[res_rmse$Group==2,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
# Predicted vs actual; the red 45-degree line marks a perfect fit.
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(General~score~-~f[age])) +
ylab("XGBoost prediction")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
# Prediction vs the untransformed COMPAS raw score.
data.frame(xgboost = pred, compas=features_filt$raw_score) %>%
ggplot() +
geom_point(aes(x=xgboost,y=compas), alpha=.3) +
theme_bw()+
xlab("XGBoost prediction") +
ylab("COMPAS raw score")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(6778)
mdl_rf = randomForest(
formula = raw_score__age_spline ~ .,
data = train
)
# Random forest RMSE uses out-of-bag predictions (mdl_rf$predicted).
res_rmse[res_rmse$Group==2,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 12
## type "eps-regression"
## cost "2"
## epsilon "0.5"
## gamma_scale "1"
## gamma "0.09090909"
res_rmse[res_rmse$Group==2,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP
# Clean up Group 2 objects; also removes mdl_svm, matching the Group 1
# clean-up (it was previously left behind).
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 3 training data
## Select features and round count features
# Group 3: criminal-history counts plus age variables -- no race.
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_charge,
p_jail30,
p_prison,
p_probation,
raw_score__age_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)
# Linear-model baseline for this group.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.29274 -0.44847 -0.07196 0.36852 2.53402
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.009042 0.018161 55.561 <2e-16 ***
## p_current_age 0.011531 0.001205 9.567 <2e-16 ***
## p_age_first_offense -0.017721 0.001269 -13.961 <2e-16 ***
## p_charge 0.022992 0.001069 21.511 <2e-16 ***
## p_jail30 0.020861 0.040374 0.517 0.605
## p_prison 0.183270 0.008830 20.755 <2e-16 ***
## p_probation 0.098896 0.007761 12.742 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5729 on 9035 degrees of freedom
## Multiple R-squared: 0.4138, Adjusted R-squared: 0.4134
## F-statistic: 1063 on 6 and 9035 DF, p-value: < 2.2e-16
res_rmse[res_rmse$Group==3,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
set.seed(999)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 15
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "1"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline
res_rmse[res_rmse$Group==3,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
# Predicted vs actual; the red 45-degree line marks a perfect fit.
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(General~score~-~f[age])) +
ylab("XGBoost prediction")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
ggsave("Figures/raw-fage_xgboost_gp3_general.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(5)
mdl_rf = randomForest(
formula = raw_score__age_spline ~ .,
data = train
)
# Random forest RMSE uses out-of-bag predictions (mdl_rf$predicted).
res_rmse[res_rmse$Group==3,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 20
## type "eps-regression"
## cost "1"
## epsilon "0.5"
## gamma_scale "2"
## gamma "0.2857143"
res_rmse[res_rmse$Group==3,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP
# Clean up Group 3 objects; also removes mdl_svm, matching the Group 1
# clean-up (it was previously left behind).
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 4 training data
## Select features and round count features
# Group 4: criminal-history counts plus age variables plus race indicators.
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_charge,
p_jail30,
p_prison,
p_probation,
race_black,
race_white,
race_hispanic,
race_asian,
race_native, # race == "Other" is the baseline
raw_score__age_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)
# Linear-model baseline for this group.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.11700 -0.43169 -0.06356 0.36301 2.38052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.689065 0.029955 23.003 < 2e-16 ***
## p_current_age 0.011449 0.001187 9.646 < 2e-16 ***
## p_age_first_offense -0.015629 0.001254 -12.466 < 2e-16 ***
## p_charge 0.022124 0.001052 21.033 < 2e-16 ***
## p_jail30 0.034811 0.039688 0.877 0.38045
## p_prison 0.173324 0.008714 19.891 < 2e-16 ***
## p_probation 0.095996 0.007629 12.584 < 2e-16 ***
## race_black 0.367757 0.025592 14.370 < 2e-16 ***
## race_white 0.237048 0.025871 9.163 < 2e-16 ***
## race_hispanic 0.094500 0.031006 3.048 0.00231 **
## race_asian 0.105556 0.085516 1.234 0.21711
## race_native 0.244428 0.109155 2.239 0.02516 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5628 on 9030 degrees of freedom
## Multiple R-squared: 0.4345, Adjusted R-squared: 0.4338
## F-statistic: 630.6 on 11 and 9030 DF, p-value: < 2.2e-16
res_rmse[res_rmse$Group==4,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
set.seed(23)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 5
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline
res_rmse[res_rmse$Group==4,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
# Predicted vs actual; the red 45-degree line marks a perfect fit.
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(General~score~-~f[age])) +
ylab("XGBoost prediction")+
theme(
text = element_text(size=12),
axis.text=element_text(size=12))
ggsave("Figures/raw-fage_xgboost_gp4_general.pdf",width = 3, height = 3, units = "in")
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
# Individuals highlighted in Table 5 of the paper, keyed by
# (person_id, screening_date).
highlight = data.frame(
person_id= c(799, 1284, 1394, 1497, 1515, 1638, 3145, 3291, 5722, 6337, 6886, 7997, 8200, 8375, 8491, 10553, 10774, 11231, 11312, 11414),
screening_date = ymd(c("2014-06-15","2014-05-14","2014-11-28","2013-07-29","2013-10-23","2013-10-04","2014-12-14","2013-01-17","2013-10-24","2014-02-04","2013-07-12","2014-04-26","2014-05-05","2013-03-19","2014-01-18","2014-09-20","2013-04-09","2014-02-23","2014-05-02","2014-11-26")),
highlight = TRUE
)
# Attach predictions and the Table 5 highlight flag to the filtered data.
df_plot = features_filt %>%
bind_cols(xgboost = predict(mdl_xgb, newdata=train_xgb)) %>%
left_join(highlight, by = c("person_id","screening_date")) %>%
mutate(highlight = if_else(is.na(highlight), FALSE, TRUE)) %>%
mutate(highlight = factor(if_else(highlight==TRUE,"In Table 5", "Not in Table 5"), levels=c("In Table 5", "Not in Table 5")))
# Label-placement groups: which side of the point each name is drawn on.
person_id_text_topright = c(8375, 11231, 1515)
#person_id_text_topright = highlight$person_id
person_id_text_topleft = c(1394, 1497)
person_id_text_botright = c(11312, 6886, 8491, 10774)
person_id_text_botleft = c(799)
ggplot() +
geom_point(aes(x=xgboost,y=raw_score, color=highlight), alpha = .3, data = filter(df_plot, highlight=="Not in Table 5")) +
geom_point(aes(x=xgboost,y=raw_score, color=highlight), data = filter(df_plot, highlight=="In Table 5")) +
theme_bw()+
geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topright & highlight=="In Table 5")) +
geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="bottom", data=filter(df_plot, person_id %in% person_id_text_topleft & highlight=="In Table 5")) +
geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="left",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botright & highlight=="In Table 5")) +
geom_text(aes(x=xgboost,y=raw_score,label=name),size=3,nudge_x=0, nudge_y=0, hjust="right",vjust="top", data=filter(df_plot, person_id %in% person_id_text_botleft & highlight=="In Table 5")) +
xlab(expression(XGBoost~pred.~of~general~score~-~f[age])) +
ylab("General score")+
theme(
text = element_text(size=12),
axis.text=element_text(size=12),
#legend.position = "top",
legend.position="none") +
scale_color_discrete(name = element_blank()) +
xlim(0.2,3.5)
ggsave("Figures/xgboost_rawScore_annotate_general.pdf",width = 4, height = 4, units = "in")
set.seed(3720)
mdl_rf = randomForest(
formula = raw_score__age_spline ~ .,
data = train
)
# Random forest RMSE uses out-of-bag predictions (mdl_rf$predicted).
res_rmse[res_rmse$Group==4,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 20
## type "eps-regression"
## cost "1"
## epsilon "0.5"
## gamma_scale "2"
## gamma "0.1666667"
res_rmse[res_rmse$Group==4,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP
# Clean up Group 4 objects; also removes mdl_svm, matching the Group 1
# clean-up (it was previously left behind).
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
### Create group 5 training data
## Select features and round count features
# Group 5: everything in Group 4 plus the extra counts p_arrest and
# p_prison30.
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_charge,
p_arrest,
p_jail30,
p_prison30,
p_prison,
p_probation,
race_black,
race_white,
race_hispanic,
race_asian,
race_native, # race == "Other" is the baseline
raw_score__age_spline)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score__age_spline) %>% as.matrix(),
"label" = train %>% select(raw_score__age_spline) %>% as.matrix()
)
# Linear-model baseline for this group.
# NOTE(review): the recorded output below shows an NA coefficient for
# p_prison30 -- in the run that produced it, p_prison30 was perfectly
# collinear with another predictor, so the fit was rank-deficient
# (hence the prediction warning further down). Verify upstream feature
# construction.
mdl_lm = lm(raw_score__age_spline ~ ., data=train)
summary(mdl_lm)
##
## Call:
## lm(formula = raw_score__age_spline ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.98368 -0.43395 -0.06177 0.36190 2.38617
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.692209 0.029933 23.126 < 2e-16 ***
## p_current_age 0.012003 0.001192 10.067 < 2e-16 ***
## p_age_first_offense -0.016215 0.001259 -12.875 < 2e-16 ***
## p_charge 0.013844 0.002143 6.460 1.10e-10 ***
## p_arrest 0.007263 0.001638 4.433 9.39e-06 ***
## p_jail30 0.022716 0.039741 0.572 0.56761
## p_prison30 NA NA NA NA
## p_prison 0.168931 0.008761 19.282 < 2e-16 ***
## p_probation 0.083112 0.008156 10.190 < 2e-16 ***
## race_black 0.368977 0.025567 14.432 < 2e-16 ***
## race_white 0.238644 0.025847 9.233 < 2e-16 ***
## race_hispanic 0.096824 0.030978 3.126 0.00178 **
## race_asian 0.106359 0.085428 1.245 0.21316
## race_native 0.239245 0.109049 2.194 0.02827 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.5623 on 9029 degrees of freedom
## Multiple R-squared: 0.4357, Adjusted R-squared: 0.4349
## F-statistic: 580.9 on 12 and 9029 DF, p-value: < 2.2e-16
res_rmse[res_rmse$Group==5,]$lm = rmse(predict(mdl_lm, newdata=train), train$raw_score__age_spline) # ADJUST GROUP
## Warning in predict.lm(mdl_lm, newdata = train): prediction from a rank-
## deficient fit may be misleading
set.seed(480)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 14
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.1"
## gamma "0.5"
## max_depth "5"
## min_child_weight "10"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score__age_spline
res_rmse[res_rmse$Group==5,]$xgb = rmse(pred, actual) # ADJUST GROUP
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
# Predicted vs actual; the red 45-degree line marks a perfect fit.
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab(expression(General~score~-~f[age])) +
ylab("XGBoost prediction")+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
### Variable importance
xgb.plot.importance(importance_matrix = xgb.importance(model = mdl_xgb))
set.seed(1123)
mdl_rf = randomForest(
formula = raw_score__age_spline ~ .,
data = train
)
# Random forest RMSE uses out-of-bag predictions (mdl_rf$predicted).
res_rmse[res_rmse$Group==5,]$rf = rmse(mdl_rf$predicted, train$raw_score__age_spline) # ADJUST GROUP
mdl_svm = fit_svm(raw_score__age_spline ~ ., train, param_svm)
## Training on 27 sets of parameters.
## [1] "Best parameters:"
## 21
## type "eps-regression"
## cost "2"
## epsilon "0.5"
## gamma_scale "2"
## gamma "0.1428571"
res_rmse[res_rmse$Group==5,]$svm = rmse(mdl_svm$fitted, train$raw_score__age_spline) # ADJUST GROUP
# Clean up Group 5 objects; also removes mdl_svm, matching the Group 1
# clean-up (it was previously left behind).
rm(train, train_xgb, mdl_lm, mdl_xgb, mdl_rf, mdl_svm)
# In-sample RMSE summary across the five predictor groups and four models.
knitr::kable(res_rmse)
| Group | lm | xgb | rf | svm |
|---|---|---|---|---|
| 1 | 0.5755271 | 0.5224769 | 0.5553091 | 0.5312492 |
| 2 | 0.5653542 | 0.5131807 | 0.5274510 | 0.5217062 |
| 3 | 0.5726338 | 0.5204973 | 0.5334502 | 0.5224062 |
| 4 | 0.5624640 | 0.5065970 | 0.5249744 | 0.5149892 |
| 5 | 0.5618528 | 0.4929722 | 0.5142440 | 0.5033342 |
We use the “Group 3” models but predict the raw score. Results from this section can be combined with Group 3 xgboost results above where we predicted the raw score minus the age lower bound.
### Create group 3 training data
## Select features and round count features
# Same predictors as Group 3, but the target here is the raw COMPAS score
# itself (not raw score minus the age lower bound).
train = features_filt %>%
select(
p_current_age,
p_age_first_offense,
p_charge,
p_jail30,
p_prison,
p_probation,
raw_score)
## Format for xgboost
train_xgb = xgb.DMatrix(
"data" = train %>% select(-raw_score) %>% as.matrix(),
"label" = train %>% select(raw_score) %>% as.matrix()
)
set.seed(540)
mdl_xgb = fit_xgboost(train_xgb, param)
## Training on 16 sets of parameters.
## 5
## objective "reg:linear"
## eval_metric "rmse"
## eta "0.05"
## gamma "0.5"
## max_depth "5"
## min_child_weight "5"
## subsample "1"
## colsample_bytree "1"
### xgboost plot
pred = predict(mdl_xgb, newdata=train_xgb)
actual = train$raw_score
print(paste("RMSE:",round(rmse(pred, actual),4)))
## [1] "RMSE: 0.511"
axis_min = min(min(pred),min(actual))
axis_max = max(max(pred),max(actual))
# Predicted vs actual raw score; the red 45-degree line marks a perfect fit.
data.frame(xgboost = pred, compas=actual) %>%
ggplot() +
geom_point(aes(x=compas,y=xgboost), alpha=.3) +
geom_abline(slope=1, color="red")+
xlim(c(axis_min,axis_max)) +
ylim(c(axis_min,axis_max)) +
coord_fixed() +
theme_bw()+
xlab("General score") +
ylab("XGBoost prediction")+
#annotate("text", x = -3.5, y = 0.5, label = paste("RMSE:",round(rmse(pred, actual),4)))+
theme(
text = element_text(size=14),
axis.text=element_text(size=14))
ggsave("Figures/raw_xgboost_gp3_general.pdf",width = 3, height = 3, units = "in")
# ProPublica-style logistic regression: predict a Medium/High COMPAS label
# (decile >= 5) from demographics, priors, charge degree and recidivism.
propub = features_filt %>%
filter(filt4) %>% # Only people with valid recidivism values
mutate(age_low = if_else(p_current_age < 25,1,0),
age_high = if_else(p_current_age > 45,1,0),
female = if_else(sex=="Female",1,0),
n_priors = p_felony_count_person + p_misdem_count_person,
compas_high = if_else(`Risk of Recidivism_decile_score` >= 5, 1, 0), # Medium and High risk scores get +1 label
race = relevel(factor(race), ref="Caucasian")) # Base level is Caucasian, as in ProPublica analysis
print(paste("Number of observations for logistic regression:",nrow(propub)))
## [1] "Number of observations for logistic regression: 5759"
mdl_glm = glm(compas_high ~
female +
age_high +
age_low +
as.factor(race) +
p_charge +
is_misdem +
recid,
family=binomial(link='logit'), data=propub)
summary(mdl_glm)
##
## Call:
## glm(formula = compas_high ~ female + age_high + age_low + as.factor(race) +
## p_charge + is_misdem + recid, family = binomial(link = "logit"),
## data = propub)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7344 -0.7673 -0.3035 0.8381 2.6740
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.593179 0.082309 -19.356 < 2e-16 ***
## female 0.123516 0.085171 1.450 0.1470
## age_high -1.489311 0.129772 -11.476 < 2e-16 ***
## age_low 1.445909 0.071243 20.296 < 2e-16 ***
## as.factor(race)African-American 0.521072 0.072977 7.140 9.32e-13 ***
## as.factor(race)Asian -0.271728 0.503999 -0.539 0.5898
## as.factor(race)Hispanic -0.301324 0.130461 -2.310 0.0209 *
## as.factor(race)Native American 0.390718 0.678081 0.576 0.5645
## as.factor(race)Other -0.713647 0.159728 -4.468 7.90e-06 ***
## p_charge 0.155033 0.006521 23.773 < 2e-16 ***
## is_misdem -0.464124 0.069574 -6.671 2.54e-11 ***
## recid 0.491790 0.068811 7.147 8.87e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7907.2 on 5758 degrees of freedom
## Residual deviance: 5665.7 on 5747 degrees of freedom
## AIC: 5689.7
##
## Number of Fisher Scoring iterations: 5